#message=FALSE means some output messages aren't printed in the knitted HTML document
knitr::opts_chunk$set(echo = FALSE, warning=FALSE)
#include=FALSE, echo=FALSE and warning=FALSE ensure the code and any error messages aren't shown in the output HTML file
#message=FALSE means no output messages are printed on the knitted HMTL file
#loading any necessary packages
library(dplyr)
library(tidyverse)
library(knitr)
library(readr)
library(ggplot2)
library(bbmle)
library(mizer)
library(Hmisc)
library(lme4)
library(RColorBrewer)
library(viridis)
library(hexbin)
load("stomach_dataset.Rdata")
#loading the dataset to be used in analysis (the dataset is already named stom_df)
df <- stom_df %>% transmute(haul_id, ices_rectangle, year, pred_species,
pred_weight_g, pred_length_cm, prey_weight_g,
prey_type = prey_funcgrp, indiv_prey_weight = prey_ind_weight_g,
prey_count, n_stomachs, no._prey_per_stmch = prey_count/n_stomachs, ppmr)
#creating a new data set called 'df' which only contains certain selected columns
df <- df[df$indiv_prey_weight != 0, ]
df <- df[df$pred_weight_g != 0, ]
df <- df[df$indiv_prey_weight != Inf, ]
#Removes any points where the prey weight = Inf or 0, or the predator weight = 0
renamed_df = df %>%
mutate(pred_species = replace(pred_species,
pred_species == "Micromesistius poutassou", "Blue Whiting")) %>%
# mutate(pred_species = replace(pred_species, pred_species == "Capros apers", "Boarfish")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Gadus morhua", "Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Limanda limanda", "Common Dab")) %>%
mutate(pred_species = replace(pred_species,
pred_species == "Merluccius merluccius", "European Hake")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Melanogrammus aeglefinus", "Haddock")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Clupea harengus", "Herring")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trachurus trachurus", "Horse Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Scomber scombrus", "Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lepidorhombus whiffiagonis", "Megrim")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lophius piscatorius", "Monkfish")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus esmarkii", "Norway Pout")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Pleuronectes platessa", "Plaice")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus minutus", "Poor Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Solea solea", "Sole")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Sprattus sprattus", "Sprat")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Merlangius merlangus", "Whiting"))
#Renaming the predator species from their latin names (e.g. Capros apers) to their more common names (e.g. Boarfish)
#Creates a new data frame (called 'renamed_df') with these replaced names
species_list <- c("Blue Whiting", "Cod", "Common Dab", "European Hake", "Haddock", "Herring",
"Horse Mackerel", "Mackerel", "Megrim", "Monkfish", "Norway Pout", "Plaice",
"Poor Cod", "Sole", "Sprat", "Whiting")
#Creates an array called 'species_list', which is list of the predator species we are focusing on in this project
renamed_df <- renamed_df[renamed_df$pred_species %in% species_list, ]
#Removes any observations of predator species not in the species_list, i.e. they are irrelevant data points for this project
renamed_df$lppmr <- log(renamed_df$ppmr)
renamed_df$lprey_weight <- log(renamed_df$indiv_prey_weight)
renamed_df$lpred_weight <- log(renamed_df$pred_weight_g)
#Adds columns which take the log value of ppmr, individual prey weight and predator weight to the main data set
This data set is formed from recordings taken by multiple ships between the years 1886-2016.
Fish were taken from some location, and their individual stomach contents recorded. Predators of the same species and (roughly) the same weight/size were recorded as a single data point (the number of predators for each data set is called ‘n_stomachs’). Prey of (roughly) the same size found in these stomachs were recorded in this same data point, and the total number of prey in some data point is called ‘prey_count’. For each individual data point,
\[ \text{no. prey per stomach} = \frac{\text{no. of stomachs sampled}}{\text{total no. of prey}} \] This is an approximation of the number of prey per stomach for some specific predator, and is called ’ no._prey_per_stmch’.
Using similar logic,‘prey_weight_g’ is the total weight of prey in the multiple stomachs sampled to make up one datapoint. ‘indiv_prey_weight’ is then the weight of one single prey individual found in the stomach
There are some other column names which are useful to note:
ggplot(renamed_df, aes(x=prey_type, fill=prey_type)) +
geom_bar(stat="count", width=0.7) +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) +
scale_fill_brewer(palette = "Dark2") +
labs(title = "Distribution of prey types for each predator", x="Prey type", y="Count")
#geom_bar() creates a bar chart from the specified data
#facet_wrap() means an individual bar chart is created for each individual predator species
#'theme()' removes any words on the x-axis so the graphs are easier to understand
#scale_fill_brewer changes the colour palette of the plot so that the colours are easy to distinguish between
These are individual bar charts showing the distribution of the type of prey each predator eats.
The prey types are:
We can see that some predators eat a range of prey types (e.g. Whiting), while others prefer to eat a single prey type in abundance (e.g. European Hake and Monkfish mostly consume fish as their prey).
ggplot(data = renamed_df, aes(lprey_weight, no._prey_per_stmch)) +
labs(title = "log(prey weight) v. number of prey per predator stomach",
x="log(prey weight)", y="No. of prey per predator stomach") +
geom_point(size=0.5)
#geom_point() adds individual points which show individual recorded points
#size=0.5 defines the size of each point on this graph
This graph is looking at the distribution of the weight of prey recorded, i.e. looking at what is the most common prey weight over all prey species. It is a graph over all the data points, so includes recordings from all the ships involved.
There are some potentially interesting results, so results from different ships were plotted on individual curves to further look into these results.
renamed_df$'haul_id_short' <- gsub("\\-.*", "", renamed_df$'haul_id')
renamed_df$'haul_id_short' <- gsub("_", "", renamed_df$'haul_id_short')
#the haul_id values are renamed to be shortened versions of the ship names (e.g. CLYDE) rather than the complete id (e.g. CLYDE-1935-6)
interesting_haul <- filter(renamed_df,
haul_id_short=='CLYDE'|haul_id_short=='END04'|haul_id_short=='LUC'|
haul_id_short=='HIDDINK'|haul_id_short=='EXCmacDATSTO815'|
haul_id_short=="Excmacdatsto815error")
#interesting_haul is a new data frame which only contains values recorded by ships which gave "interesting" looking datapoints
haul_list_interesting <- unique(interesting_haul$haul_id_short)
#creates an array containing every individual (renamed) ship name that we are interested in
ggplot (data = interesting_haul, aes(x=lprey_weight, y=no._prey_per_stmch)) +
labs(title = "log(prey weight) v. number of prey per stomach - separated by ship names",
x="log(prey weight)", y="No. prey per stomach") +
geom_point(size=0.02, colour="red") +
theme(strip.text = element_text(size = 5)) +
facet_wrap(~interesting_haul$haul_id_short, scale="free_y")
These six graphs show a range of unusual looking points:
Though we will not remove or alter the data set to account for these potentially erroneous data points in this project, for further analysis it may be sensible to remove the data points these six ships recorded so that final analysis is as reliable as possible.
#message=FALSE means some output messages aren't printed in the knitted HTML document
ggplot(data = renamed_df, aes(lprey_weight, lpred_weight)) +
labs(title = "log(Predator mass) v. log(prey mass) plot",
x="log(Prey mass)", y="log(Predator mass)") +
geom_hex(bins = 50) +
scale_fill_gradient2(low = "black", mid="orange", high = "red", midpoint=600) +
stat_smooth(method='lm', se=FALSE, colour="blue")
# stat_smooth adds a line of best fit plotted through the points
# method='lm' ensures it is a straight line, i.e. a linear model
# se=FAlSE creates only a single line, with no error of margin included
#geom_hex creates "bins" in the data where the density is calculated over
#bins=50 means that there are 50 "bins" in the horizontal direction and 50 in the vertical
#scale_fill_gradient2 creates a density colour scale, meaning that areas with a high density of points are red and areas with a low density of points are darker in colour
slope <- coef(lm(renamed_df$lpred_weight~renamed_df$lprey_weight))
paste("slope of the log(pred) v. log(prey) line of best fit:", slope[2])
## [1] "slope of the log(pred) v. log(prey) line of best fit: 0.489336522809739"
# coef(lm()) creates an array containing the coefficients of the linear model of the relationship between two axes
# The first item of the array gives out the y-intercept of the line, and the second item of the array is the gradient
We are attempting to find a link between the predator mass and the prey mass. Log() of each axis is used to see the proportionality of the axes, where the slope of the added line should = 1. This is because:
\[ \log (\text{pred weight}) = m \times \text{log(prey weight)} + c \]
Where m is the gradient of the slope and c is the y-intercept (thinking of this graph as a linear model).
Taking the exponential of both sides,
\[ \text{pred weight} = \exp({{m}\times{\log(\text{prey weight})}}) + \log(D) \] where log(d) = c.Â
Finally,
\[ \text{pred weight} = \text{prey weight}^m \times D \]
Hence, we want m=1 to show that pred weight proportional to prey weight (as expected).
For this graph, the slope is not equal to one. However, this graph is plotted over the entire data set and includes values from all of the available predator species.
Instead, separate the plots by predator species to see if there is some proportionality between the predator and prey weight for each specific species.
ggplot(data = renamed_df, aes(lprey_weight, lpred_weight)) +
labs(title = "log(pred. mass) v. log(prey mass) separated by predator species",
x="log(prey mass)", y="log(pred. mass)") +
geom_point(size=0.2) +
facet_wrap(~pred_species, scale="free_y") +
theme(strip.text = element_text(size = 10)) +
stat_smooth (method='lm', se=FALSE, colour="red")
i <- 1
species_grad =c()
#setting i=1 for the while loop
while(i<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[i]))
grad <- coef(lm((species_df$lpred_weight)~(species_df$lprey_weight)))
species_grad[i] <- grad[2]
i=i+1
}
#'while' means that this function repeats through the length of 'species_list' until all predator species are accounted for
#i=i+1 increases the value of i each time through this loop to motivate this repeat
#this creates a data frame (species_df) containing each indivdual predator species, then calculates the gradient of the log(pred weight) v. log(prey weight) graph for this individual predator species, then inserts this gradient value to the ith value of an array named 'species_grad'
pvp_grads <- data.frame(pred_species=c(species_list),
gradient=c(species_grad))
kable(pvp_grads)
| pred_species | gradient |
|---|---|
| Blue Whiting | -0.0258049 |
| Cod | 0.8343249 |
| Common Dab | 0.1217269 |
| European Hake | 0.7230261 |
| Haddock | 0.2604050 |
| Herring | 0.2220153 |
| Horse Mackerel | 0.2290883 |
| Mackerel | 0.0156493 |
| Megrim | 0.3020678 |
| Monkfish | 0.7009350 |
| Norway Pout | 0.4455201 |
| Plaice | 0.4548590 |
| Poor Cod | 0.1394437 |
| Sole | 0.1141953 |
| Sprat | 0.0381477 |
| Whiting | 0.6664210 |
#creates a new data frame name 'pvp_grads', with one column named 'pred_species' and the other named 'gradient'
#each row contains the name of some individual predator species and its associated gradient
#kable prints the information in the pvp_grads
Here, we are splitting up the graphs over each individual predator species to look and see if there is a proportionality for each specific species.
We are wanting gradient=1 for proportionality. This is only (approximately) true for the species Cod and Monkfish. Although this is a very limited number of the predator species, the assumption that the predator and prey masses are proportional is a reasonable one, so we will continue wiht this assumption.
ggplot(data=renamed_df, aes(lpred_weight, lppmr)) +
geom_point(size=0.001) +
labs(title = "log(pred mass) v. log(ppmr) plot", x="log(Pred mass)", y="log(PPMR)") +
stat_smooth (method='lm', se=FALSE) +
facet_wrap(~pred_species, scale="free_y")
Graphing log(pred mass) v. log(ppmr) for individual predator species to see if the predator mass related to the ppmr.
\[ \log(\text{pred mass}) = m \times \log (\text{prey mass}) + c \]
where m is the gradient (assuming a linear model between the variables) and c is the y-intercept.
We want them to not be proportional (i.e. slope, m= 0) to prove that log(predator mass) is not related to the log(ppmr) (and hence the ppmr is independent of the predator mass for a certain species).
species_grad_ppmr=c()
j <- 1
#creates an empty vector called 'species_grad_ppmr'
#setting j=1 for the while loop
while(j<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[j]))
grad <- coef(lm((species_df$lppmr)~(species_df$lpred_weight)))
species_grad_ppmr[j] <- grad[2]
j=j+1
}
ppmrvp_grads <- data.frame(pred_species=c(species_list),
gradient=c(species_grad_ppmr))
kable(ppmrvp_grads)
| pred_species | gradient |
|---|---|
| Blue Whiting | 1.2032364 |
| Cod | 0.2709018 |
| Common Dab | 0.2435919 |
| European Hake | 0.1617011 |
| Haddock | 0.2581471 |
| Herring | 0.7232217 |
| Horse Mackerel | 0.4273000 |
| Mackerel | 0.7621392 |
| Megrim | -0.2055083 |
| Monkfish | 0.1811160 |
| Norway Pout | 0.0973909 |
| Plaice | 0.3061335 |
| Poor Cod | -0.0895529 |
| Sole | 0.6335299 |
| Sprat | 0.4688564 |
| Whiting | 0.2891959 |
For the assumption to be upheld, we need gradient=0. This is (approximately) satisfied for the species Cod, Common Dab, Euorpean Hake, Haddock, Megrim, Monkfish, Norway Pout and Poor Cod. Although none of these are exactly equal to 0, they are sufficiently close to continue with the assumption that the ppmr is independent of the predator mass for the named predator species.
species_df <- renamed_df %>% filter(pred_species == fixed("Poor Cod"))
#creating a data frame only containing observations where the predator species is poor cod.
ggplot(data=species_df, aes(lpred_weight, lppmr)) +
geom_point(size=0.5) +
labs(title = "log(pred mass) v. log(ppmr) plot: Poor Cod", x="log(Pred mass)", y="log(PPMR)") +
facet_wrap(~pred_species, scale="free_y") +
stat_smooth(aes(weight=no._prey_per_stmch, colour='by no. of prey in stomach'), method='lm', se=FALSE) +
stat_smooth(aes(colour='no weighting'), method='lm', se=FALSE) +
stat_smooth(aes(weight=prey_weight_g, colour='by prey bio'), method='lm', se=FALSE) +
stat_summary(fun.data=mean_se, geom="linerange") +
scale_colour_manual(name="Weightings",
values=c('by no. of prey in stomach'='blue',
'no weighting'='red', 'by prey bio'='green'))
# se=FALSE doesn't add an area of error around the LOBF
# fun.data=mean_se calculates the mean and standard error for each point
# linerange draws a point range between an upper and lower limit for the line, and the mean for the point (using the values calcuated in 'fun.data=mean_se')
paste("slope of the log(ppmr) v. log(pred_weight) line of best fit for poor cod:")
## [1] "slope of the log(ppmr) v. log(pred_weight) line of best fit for poor cod:"
#weighted by number of prey per stomach
perstmch_weighting <-
coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$no._prey_per_stmch))
paste("i) Weighted by no prey in the stomach:", perstmch_weighting[2])
## [1] "i) Weighted by no prey in the stomach: -1.64313946193691"
#no weighting
no_weighting <- coef(lm(species_df$lppmr~species_df$lpred_weight))
paste("ii) No weighting in the calculation:", no_weighting[2])
## [1] "ii) No weighting in the calculation: -0.0895529388001952"
#weighted by prey biomass
biomass_weighting <-
coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$prey_weight_g))
paste("iii) Weighted by prey biomass:", biomass_weighting[2])
## [1] "iii) Weighted by prey biomass: 0.0116668819369047"
Looking at just the predator species ‘poor cod’. There are three LOBF (line of best fit), weighted by different variables:
When weighted by prey biomass, the gradient is equal to zero for 2 s.f.. Hence, the approximation (of slope=0) is true when looking at a prey biomass weighting.
Here, we are looking for the most common log(PPMR) for each individual species. This will be done by plotting log(PPMR) against the density of points, and weighting observations on different variables. By assuming these are normally distributed relations, there should be a ‘peak’ point of log(ppmr) which is where the mean/most common log(ppmr) value lies. We assume this will be different for different species of predator, i.e. each predator has a preferred ppmr value and hence each predator type has a preferred relative size of prey.
ggplot(data = renamed_df, aes(x=lppmr)) +
labs(title = "log(PPMR) v. biomass density of prey", x="log(PPMR)", y="Biomass density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(strip.text = element_text(size = 5)) +
geom_density(aes(weight = prey_weight_g), colour="red")
#using facet_wrap for the variable (pred_species) means the data is separated into individual plots for each predator species
#geom_density means area under the curve = 1 (i.e. the graph is normalised)
#weight=prey_weight_g means the points are 'weighted' by the column weight (biomass) of each individual prey
These plots are weighted by the prey biomass. This means that we are looking at the distribution of prey biomass across values of log(PPMR), so points with a larger biomass are prioritised/weighted more than those with a smaller biomass.
ggplot(data = renamed_df, aes(x=lppmr)) +
labs(title = "log(PPMR) v. number density of prey", x="log(PPMR)", y="Number density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(strip.text = element_text(size = 5)) +
geom_density(aes(weight = no._prey_per_stmch), colour="red")
#weight=no._prey_per_stmch means the points are 'weighted' by the number of prey per stomach
These plots are weighted by the number pf prey per stomach. This means that data points with a larger number of prey per stomach are prioritised/weighted more than those with a smaller number of prey per stomach.
By looking at the two differently weighted graphs, it is clear to see that the plots are ‘shifted’ by some amount (e.g. the Blue Whiting when weighted by prey biomass has a mean log(ppmr) of roughly 4.8, and when weighted by number of prey this mean becomes roughly 7.5).
Here, we take only observations relating to the predator type Herring. This allows us to do more specific analysis about the shifted mean and begin to explain some of the maths behind this shifting.
herringDF <- renamed_df %>%
filter(pred_species == fixed("Herring"))
#creates a separate data set (called 'herringDF') only containing observations for the predator species 'Herring'
bio_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE)
bio_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE))
#weighted.mean gives the arithmetic mean of log(ppmr), where datapoints are weighted by the prey weight of observations
#similarly, wtd.var is the variance of log(ppmr), where the datapoints are also weighted by the prey weight
#standard deviation is defined as the square root of the variance
#na.rm=TRUE means any rows with missing values (values that equal 'na') aren't included in the mean/variance calculations, but are instead ignored
paste("log(ppmr) mean, weighted by biomass of prey:", bio_herringmean)
## [1] "log(ppmr) mean, weighted by biomass of prey: 6.62917673459157"
paste("Standard deviation of this:", bio_herringSD)
## [1] "Standard deviation of this: 2.3508789268221"
ggplot(data = herringDF, aes(x=lppmr)) +
labs(title = "log(ppmr) against biomass density for Herring, weighted by prey biomass",
x="log(PPMR)",y="Biomass density of observations") +
geom_density(aes(weight = prey_weight_g), colour="blue") +
theme(plot.title = element_text(size=15)) +
stat_function(fun = dnorm, args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
xlim(-5,17)
#stat_function adds a normal distribution curve (fun=dnorm) with a mean and standard deviation equal to what was calculated earlier for this data set
#xlim sets limits for the x-axis so that all the necessary data points can be seen
The curve with points weighted by the prey biomass is plotted in blue, and a normal curve (also weighted by prey biomasss) is plotted over the top in black.
no_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE)
no_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE))
#the mean and variance are both weighted by the number of prey
paste("log(ppmr) mean, weighted by number of observations:", no_herringmean)
## [1] "log(ppmr) mean, weighted by number of observations: 13.5410236346048"
paste("Standard deviation of this:", no_herringSD)
## [1] "Standard deviation of this: 2.63330496940788"
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "log(ppmr) against number density for Herring, weighted by no. of prey",
x="log(PPMR)", y="No. density of observations") +
geom_density(aes(weight = no._prey_per_stmch), colour="red") +
theme(plot.title = element_text(size=15)) +
stat_function(fun = dnorm, args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
xlim(0,25)
#the xlim is different to the graph above because these data points lie over a slightly different log(ppmr) range
The curve with points weighted by the number of prey per stomach is plotted in red, and a normal curve (also weighted by the number of prey per stomach) is plotted over the top in black.
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "log(ppmr) against biomass/number density for Herring,
with different weightings",
x="log(PPMR)", y="Number/biomass density of observations") +
geom_density(aes(weight = no._prey_per_stmch, colour="prey biomass")) +
geom_density(aes(weight = prey_weight_g, colour="number of prey per stomach")) +
stat_function(fun = dnorm, args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
stat_function(fun = dnorm, args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
theme(plot.title = element_text(size=15)) +
scale_color_manual(name='Weightings',
values=c('number of prey per stomach'='red', 'prey biomass'='blue')) +
xlim(-5,25)
#scale_color_manual manually adds a key to the graph to describe what the differently coloured lines represent
#each curve is given a name using 'aes(colour="")', and these are then given a specific colour using 'values='
This is a graph with both the the distribution of Herring log(PPMR) as weighted by prey biomass and number of prey plotted over each other (along with appropriately weighted normal distribution curves plotted over the top of each).
Prey biomass weighted mean: 6.629177 No. of prey weighted mean: 13.54102
Prey biomass standard deviation: 2.3508789268221 No. of prey standard deviation: 2.63330496940788
The mean is shifted by 6.911843.
The mathematics of shifted means for this difference in weighting is:
\[ \text{weighted mean}_{\text{expected prey biomass}} = \text{weighted mean}_{\text{no. of prey }} - (\text{standard deviation}_{\text{no. of prey }})^2 \]
Hence, the expected result is:
\[ \text{weighted mean}_{\text{actual prey biomass}} = 13.54102 - (2.63330496940788)^2 = 6.60672493809 \] There is a difference in expected and actual prey biomass weighted mean of 0.30511806191. This is a fairly insignificant value, which suggests the shifting mean equations are relatively accurate for this data set with these weightings.
Therefore, we can continue with the assumption that there is a difference in the mean of log(ppmr) of the square of the standard deviation when we weight by number of prey per stomach and prey biomass.
renamed_df %>%
ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
geom_line(size=1) + theme_classic()
null <- lmer(lppmr ~ (1|pred_species), data = renamed_df, REML=FALSE)
#the 'null' model has no predictors (i.e. no linear effects)
#REML=FALSE means the model is made using 'maximum likelihood' (rather than Restricted Maximum Likelihood), which allows us to compare different models
#Generally: lmer(fixed ~ (1|random) + linear)
renamed_df %>%
mutate(lppmr = fitted(null)) %>%
ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
geom_line(size=1)
one <- lmer(lppmr ~ lprey_weight + (1|pred_species), data = renamed_df, REML=FALSE)
#same slope for every effect (but varying intercept)
renamed_df %>%
mutate(lppmr= fitted(one)) %>%
ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
geom_line(size=1)
#couldn't introduce (1+pred_species|haul_id_short) or (1+lprey_weight|haul_id_short)
two <- lmer(lppmr ~ lprey_weight + (0+lprey_weight|pred_species), data = renamed_df, REML=FALSE)
#Random intercept and random slope (independent)
renamed_df %>%
mutate(lppmr = fitted(two)) %>%
ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
geom_line(size=1)
three <- lmer(lppmr ~ lprey_weight + (1+lprey_weight|pred_species), data = renamed_df, REML=FALSE)
#Random intercept and random slope (correlated)
renamed_df %>%
mutate(lppmr = fitted(three)) %>%
ggplot(aes(x=lprey_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
geom_line(size=1)
anova(null,one,two,three)
## Data: renamed_df
## Models:
## null: lppmr ~ (1 | pred_species)
## one: lppmr ~ lprey_weight + (1 | pred_species)
## two: lppmr ~ lprey_weight + (0 + lprey_weight | pred_species)
## three: lppmr ~ lprey_weight + (1 + lprey_weight | pred_species)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null 3 1188058 1188090 -594026 1188052
## one 4 1084789 1084831 -542391 1084781 103271 1 < 2.2e-16 ***
## two 4 1047701 1047743 -523847 1047693 37088 0
## three 6 1024125 1024188 -512057 1024113 23580 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#comparing the four models
Null model:
\[ \log(\text{ppmr})_{ij} = \beta_{0j} + r_{ij} = \gamma_{00} + u_{oj} + r_{ij} \]
\(\gamma_{00}\) is the intercept. \(u_{0j}\) is the effect of the random term (haul_id_short) $ r_{ij}$ is the error term.
a <- lmer(lppmr ~ lprey_weight + (1|haul_id_short), data = renamed_df, REML=FALSE)
b <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year), data = renamed_df, REML=FALSE)
#(1+lprey_weight|pred_species)
c <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year) + (1|ices_rectangle), data = renamed_df, REML=FALSE)
d <- lmer(lppmr ~ lprey_weight + (1|haul_id_short) + (1|year) + (1|ices_rectangle) + (1|pred_species), data = renamed_df, REML=FALSE)
e <- lmer(lppmr ~ lprey_weight + pred_species + (1|haul_id_short) + (1|year) + (1|ices_rectangle), data = renamed_df, REML=FALSE)
hist(resid(a), border='red', col="transparent", main = paste("Histogram of residuals" ))
hist(resid(b), border='green', col="transparent", add=TRUE)
hist(resid(c), border='blue', col="transparent", add=TRUE)
hist(resid(d), border='black', col="transparent", add=TRUE)
hist(resid(e), border='yellow', col="transparent", add=TRUE)
print(VarCorr(a),comp="Variance")
## Groups Name Variance
## haul_id_short (Intercept) 3.7532
## Residual 1.3358
#Only prints the variance estimates for the random effects
print(VarCorr(b),comp="Variance")
## Groups Name Variance
## haul_id_short (Intercept) 3.9630
## year (Intercept) 1.7655
## Residual 1.2824
print(VarCorr(c),comp="Variance")
## Groups Name Variance
## ices_rectangle (Intercept) 0.24284
## year (Intercept) 1.80010
## haul_id_short (Intercept) 3.11164
## Residual 1.20774
print(VarCorr(d),comp="Variance")
## Groups Name Variance
## ices_rectangle (Intercept) 0.18783
## year (Intercept) 2.03163
## haul_id_short (Intercept) 2.72256
## pred_species (Intercept) 0.84187
## Residual 1.15111
print(VarCorr(e),comp="Variance")
## Groups Name Variance
## ices_rectangle (Intercept) 0.18761
## year (Intercept) 2.02628
## haul_id_short (Intercept) 2.71335
## Residual 1.15104
#Variance increases??
AIC(a)
## [1] 837098.1
AIC(b)
## [1] 826114.8
AIC(c)
## [1] 773370.5
AIC(d)
## [1] 761133.9
AIC(d)
## [1] 761133.9
AIC(e)
## [1] 761051
#AIC decreases throughout
Mathematical representation of model:
\[ \hat{\log(\text{ppmr})}_{i} = \beta_{0j[i]} + \beta_{1j[i]} \times \hat{\text{lprey_weight}_{i}} + \epsilon \]
\(i\) is the individual observation. \(j\) is the haul_id. \(\epsilon\) is a general error term.
The lppmr for an observation (\(i\)) with lprey_weight=0 depends on their haul_id \(\beta_{0j[i]}\), and the rate of change of this value also varies by haul_id \(\beta_{j[i]}\):
The \(\beta_{0[i]}\) term represents the intercept of the observation (which varies based on the haul_id).
Variance of the groups decreases as other random effects are added to the models. But, the error term (standard error) roughly increases as more random effects are added to the model.
Fixed effect: log(ppmr) (the main effect we are tracking)
Random effects: haul_id_short, year, ices_rectangle
Linear effects: pred_species, prey_weight_g (what we assume will affect the fixed effect)
Weightings: prey_weight_g, no._prey_per_stmch
#Do we need weighting? Bc it's not a density plot
#trial_df <- data.frame(lppmr = 10:5,
# prey_weight_g = "x",
# lprey_weight = letters[1:6],
# no._prey_per_stmch = "y")
#weight_mixed <- lmer(lppmr ~ lprey_weight, data = herringDF, REML=FALSE)
#weighting observations by prey_weight_g
#two <- update(weight_mixed, weights = prey_weight_g)
#summary(two)
#weighting observations by no._prey_per_stmch
#three <- update(weight_mixed, weights = no._prey_per_stmch)
#summary(three)
#anova(two,three)